{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 학습 파이프라인 디버깅(파이토치)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers, Datasets, and Evaluate libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install datasets evaluate transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ValueError: You have to specify either input_ids or inputs_embeds'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "import evaluate\n",
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    AutoModelForSequenceClassification,\n",
    "    TrainingArguments,\n",
    "    Trainer,\n",
    ")\n",
    "\n",
    "raw_datasets = load_dataset(\"glue\", \"mnli\")\n",
    "\n",
    "model_checkpoint = \"distilbert-base-uncased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
    "\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n",
    "\n",
    "\n",
    "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)\n",
    "\n",
    "args = TrainingArguments(\n",
    "    f\"distilbert-finetuned-mnli\",\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    save_strategy=\"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    num_train_epochs=3,\n",
    "    weight_decay=0.01,\n",
    ")\n",
    "\n",
    "metric = evaluate.load(\"glue\", \"mnli\")\n",
    "\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "\n",
    "trainer = Trainer(\n",
    "    model,\n",
    "    args,\n",
    "    train_dataset=raw_datasets[\"train\"],\n",
    "    eval_dataset=raw_datasets[\"validation_matched\"],\n",
    "    compute_metrics=compute_metrics,\n",
    ")\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'hypothesis': 'Product and geography are what make cream skimming work. ',\n",
       " 'idx': 0,\n",
       " 'label': 1,\n",
       " 'premise': 'Conceptually cream skimming has two basic dimensions - product and geography.'}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train_dataset[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'ValueError: expected sequence of length 43 at dim 1 (got 37)'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "import evaluate\n",
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    AutoModelForSequenceClassification,\n",
    "    TrainingArguments,\n",
    "    Trainer,\n",
    ")\n",
    "\n",
    "raw_datasets = load_dataset(\"glue\", \"mnli\")\n",
    "\n",
    "model_checkpoint = \"distilbert-base-uncased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
    "\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n",
    "\n",
    "\n",
    "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)\n",
    "\n",
    "args = TrainingArguments(\n",
    "    f\"distilbert-finetuned-mnli\",\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    save_strategy=\"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    num_train_epochs=3,\n",
    "    weight_decay=0.01,\n",
    ")\n",
    "\n",
    "metric = evaluate.load(\"glue\", \"mnli\")\n",
    "\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "\n",
    "trainer = Trainer(\n",
    "    model,\n",
    "    args,\n",
    "    train_dataset=tokenized_datasets[\"train\"],\n",
    "    eval_dataset=tokenized_datasets[\"validation_matched\"],\n",
    "    compute_metrics=compute_metrics,\n",
    ")\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'[CLS] conceptually cream skimming has two basic dimensions - product and geography. [SEP] product and geography are what make cream skimming work. [SEP]'"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.decode(trainer.train_dataset[0][\"input_ids\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['attention_mask', 'hypothesis', 'idx', 'input_ids', 'label', 'premise'])"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train_dataset[0].keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(trainer.model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train_dataset[0][\"attention_mask\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(trainer.train_dataset[0][\"attention_mask\"]) == len(\n",
    "    trainer.train_dataset[0][\"input_ids\"]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train_dataset[0][\"label\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['entailment', 'neutral', 'contradiction']"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train_dataset.features[\"label\"].names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "~/git/transformers/src/transformers/data/data_collator.py in torch_default_data_collator(features)\n",
       "    105                 batch[k] = torch.stack([f[k] for f in features])\n",
       "    106             else:\n",
       "--> 107                 batch[k] = torch.tensor([f[k] for f in features])\n",
       "    108 \n",
       "    109     return batch\n",
       "\n",
       "ValueError: expected sequence of length 45 at dim 1 (got 76)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "for batch in trainer.get_train_dataloader():\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<function transformers.data.data_collator.default_data_collator(features: List[InputDataClass], return_tensors='pt') -> Dict[str, Any]>"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_collator = trainer.get_train_dataloader().collate_fn\n",
    "data_collator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "import evaluate\n",
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    AutoModelForSequenceClassification,\n",
    "    DataCollatorWithPadding,\n",
    "    TrainingArguments,\n",
    "    Trainer,\n",
    ")\n",
    "\n",
    "raw_datasets = load_dataset(\"glue\", \"mnli\")\n",
    "\n",
    "model_checkpoint = \"distilbert-base-uncased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
    "\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n",
    "\n",
    "\n",
    "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)\n",
    "\n",
    "args = TrainingArguments(\n",
    "    f\"distilbert-finetuned-mnli\",\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    save_strategy=\"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    num_train_epochs=3,\n",
    "    weight_decay=0.01,\n",
    ")\n",
    "\n",
    "metric = evaluate.load(\"glue\", \"mnli\")\n",
    "\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "\n",
    "trainer = Trainer(\n",
    "    model,\n",
    "    args,\n",
    "    train_dataset=tokenized_datasets[\"train\"],\n",
    "    eval_dataset=tokenized_datasets[\"validation_matched\"],\n",
    "    compute_metrics=compute_metrics,\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    ")\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_collator = trainer.get_train_dataloader().collate_fn\n",
    "batch = data_collator([trainer.train_dataset[i] for i in range(4)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_collator = trainer.get_train_dataloader().collate_fn\n",
    "actual_train_set = trainer._remove_unused_columns(trainer.train_dataset)\n",
    "batch = data_collator([actual_train_set[i] for i in range(4)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for batch in trainer.get_train_dataloader():\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "~/.pyenv/versions/3.7.9/envs/base/lib/python3.7/site-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)\n",
       "   2386         )\n",
       "   2387     if dim == 2:\n",
       "-> 2388         ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)\n",
       "   2389     elif dim == 4:\n",
       "   2390         ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)\n",
       "\n",
       "IndexError: Target 2 is out of bounds."
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "outputs = trainer.model.cpu()(**batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.model.config.num_labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "import evaluate\n",
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    AutoModelForSequenceClassification,\n",
    "    DataCollatorWithPadding,\n",
    "    TrainingArguments,\n",
    "    Trainer,\n",
    ")\n",
    "\n",
    "raw_datasets = load_dataset(\"glue\", \"mnli\")\n",
    "\n",
    "model_checkpoint = \"distilbert-base-uncased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
    "\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n",
    "\n",
    "\n",
    "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)\n",
    "\n",
    "args = TrainingArguments(\n",
    "    f\"distilbert-finetuned-mnli\",\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    save_strategy=\"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    num_train_epochs=3,\n",
    "    weight_decay=0.01,\n",
    ")\n",
    "\n",
    "metric = evaluate.load(\"glue\", \"mnli\")\n",
    "\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "\n",
    "trainer = Trainer(\n",
    "    model,\n",
    "    args,\n",
    "    train_dataset=tokenized_datasets[\"train\"],\n",
    "    eval_dataset=tokenized_datasets[\"validation_matched\"],\n",
    "    compute_metrics=compute_metrics,\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for batch in trainer.get_train_dataloader():\n",
    "    break\n",
    "\n",
    "outputs = trainer.model.cpu()(**batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "\n",
    "device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
    "batch = {k: v.to(device) for k, v in batch.items()}\n",
    "\n",
    "outputs = trainer.model.to(device)(**batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "loss = outputs.loss\n",
    "loss.backward()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.create_optimizer()\n",
    "trainer.optimizer.step()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TypeError: only size-1 arrays can be converted to Python scalars"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# This will take a long time and error out, so you shouldn't run this cell\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TypeError: only size-1 arrays can be converted to Python scalars"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.evaluate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for batch in trainer.get_eval_dataloader():\n",
    "    break\n",
    "\n",
    "batch = {k: v.to(device) for k, v in batch.items()}\n",
    "\n",
    "with torch.no_grad():\n",
    "    outputs = trainer.model(**batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "TypeError: only size-1 arrays can be converted to Python scalars"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predictions = outputs.logits.cpu().numpy()\n",
    "labels = batch[\"labels\"].cpu().numpy()\n",
    "\n",
    "compute_metrics((predictions, labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((8, 3), (8,))"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predictions.shape, labels.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'accuracy': 0.625}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    predictions = np.argmax(predictions, axis=1)\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "\n",
    "compute_metrics((predictions, labels))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from datasets import load_dataset\n",
    "import evaluate\n",
    "from transformers import (\n",
    "    AutoTokenizer,\n",
    "    AutoModelForSequenceClassification,\n",
    "    DataCollatorWithPadding,\n",
    "    TrainingArguments,\n",
    "    Trainer,\n",
    ")\n",
    "\n",
    "raw_datasets = load_dataset(\"glue\", \"mnli\")\n",
    "\n",
    "model_checkpoint = \"distilbert-base-uncased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)\n",
    "\n",
    "\n",
    "def preprocess_function(examples):\n",
    "    return tokenizer(examples[\"premise\"], examples[\"hypothesis\"], truncation=True)\n",
    "\n",
    "\n",
    "tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)\n",
    "\n",
    "args = TrainingArguments(\n",
    "    f\"distilbert-finetuned-mnli\",\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    save_strategy=\"epoch\",\n",
    "    learning_rate=2e-5,\n",
    "    num_train_epochs=3,\n",
    "    weight_decay=0.01,\n",
    ")\n",
    "\n",
    "metric = evaluate.load(\"glue\", \"mnli\")\n",
    "\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    predictions = np.argmax(predictions, axis=1)\n",
    "    return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "\n",
    "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
    "\n",
    "trainer = Trainer(\n",
    "    model,\n",
    "    args,\n",
    "    train_dataset=tokenized_datasets[\"train\"],\n",
    "    eval_dataset=tokenized_datasets[\"validation_matched\"],\n",
    "    compute_metrics=compute_metrics,\n",
    "    data_collator=data_collator,\n",
    "    tokenizer=tokenizer,\n",
    ")\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for batch in trainer.get_train_dataloader():\n",
    "    break\n",
    "\n",
    "batch = {k: v.to(device) for k, v in batch.items()}\n",
    "trainer.create_optimizer()\n",
    "\n",
    "for _ in range(20):\n",
    "    outputs = trainer.model(**batch)\n",
    "    loss = outputs.loss\n",
    "    loss.backward()\n",
    "    trainer.optimizer.step()\n",
    "    trainer.optimizer.zero_grad()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'accuracy': 1.0}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with torch.no_grad():\n",
    "    outputs = trainer.model(**batch)\n",
    "preds = outputs.logits\n",
    "labels = batch[\"labels\"]\n",
    "\n",
    "compute_metrics((preds.cpu().numpy(), labels.cpu().numpy()))"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "학습 파이프라인 디버깅(파이토치)",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
